suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
# ---- Paths and plotting defaults ----
# Analysis root. The trailing slash matters: every path below is built by
# plain string concatenation with paste0().
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
setwd(wd)
# Raw-data volume.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'
# Output folders for the tables and figures written by this script.
tabledir <- paste0(wd, 'Tables/DRS_m3C_sites/Metagene_CDS/')
figdir <- paste0(wd, 'Figures/DRS_m3C_sites/Metagene_CDS/')
# Session-wide ggplot2 theme: classic theme at 7 pt with the legend below.
theme_set(theme_classic(base_size = 7) + theme(legend.position = 'bottom'))
# Prefix `path` with the analysis root stored in the global `wd`.
# No separator is inserted, so `wd` is expected to end with "/".
# Vectorised over `path`; returns a character vector.
paste_wd <- function(path) {
  analysis_root <- wd
  paste0(analysis_root, path)
}
# Read the transcriptome-coordinate CDS annotation (BED12) and keep only the
# columns needed to locate the CDS inside each transcript.
#
# bed_path: path to the BED12 file. Defaults to the GENCODE v43 (+tRNA) file
#   this analysis has always used, so existing zero-argument calls are
#   unaffected; pass another path to run on a different annotation or mount.
#
# Returns a table with columns transcript_id (the BED `chrom` field), start,
# end, thickStart and thickEnd.
# NOTE(review): read_bed12() is not defined in this file; presumably it comes
# from the myUtilities package loaded at the top. Confirm.
read_CDS_annotation_bed <- function(
    bed_path = '/Volumes/Mitsu_NGS_2/METTL2A/Database/gencode.v43.annotation_plus-tRNA_CDS_transcriptome.bed'
) {
  read_bed12(bed_path) |>
    # select() can rename while selecting, so no separate rename() is needed.
    select(transcript_id = chrom, start, end, thickStart, thickEnd)
}
# Keep only rows whose transcript has both CDS boundaries (thickStart and
# thickEnd) available; rows where either one is NA are dropped.
remove_noCDSinfo_RNAs <- function(df) {
  # Comma-separated conditions in filter() are combined with AND.
  filter(df, !is.na(thickStart), !is.na(thickEnd))
}
# Add a `kmer_region` column classifying each position (`kmer_middle`) as
# 'fiveprimeUTR', 'CDS' or 'threeprimeUTR'.
#
# Boundaries are inclusive on the upper side:
#   kmer_middle <= thickStart            -> 5'UTR
#   thickStart < kmer_middle <= thickEnd -> CDS
#   thickEnd < kmer_middle <= length     -> 3'UTR
# Rows without CDS coordinates, or with a position beyond `length`, get NA.
determine_kmer_region <- function(df) {
  mutate(
    df,
    kmer_region = case_when(
      # Must come first: otherwise a row with only one missing boundary could
      # still match one of the comparisons below.
      is.na(thickStart) | is.na(thickEnd) ~ NA_character_,
      kmer_middle <= thickStart ~ 'fiveprimeUTR',
      kmer_middle <= thickEnd ~ 'CDS',
      kmer_middle <= length ~ 'threeprimeUTR',
      .default = NA_character_
    )
  )
}
# Count rows per (kmer_region, genetype2) and express each count as a
# percentage of all rows of the same genetype2.
#
# Returns columns kmer_region, genetype2, num_m3C, percent_m3C.
# The result is still grouped by genetype2.
calc_percent_m3C_in_region <- function(df) {
  site_counts <- summarise(
    group_by(df, kmer_region, genetype2),
    num_m3C = n(),
    .groups = 'drop'
  )
  mutate(
    group_by(site_counts, genetype2),
    percent_m3C = 100 * num_m3C / sum(num_m3C)
  )
}
# Derive the 5'UTR, CDS and 3'UTR length of every transcript and return them
# in long format: one row per transcript and region.
#
# Output columns: transcript_id, genetype2, length, kmer_region, region_length.
calc_region_length <- function(df) {
  with_lengths <- mutate(
    df,
    fiveprimeUTR_length = thickStart,
    CDS_length = thickEnd - thickStart,
    threeprimeUTR_length = length - thickEnd
  )
  # ends_with('length') also keeps the plain `length` column, which is why it
  # is excluded from the pivot below. distinct() collapses the input's
  # multiple rows per transcript into one.
  per_transcript <- distinct(
    select(with_lengths, transcript_id, genetype2, ends_with('length'))
  )
  pivot_longer(
    per_transcript,
    cols = !c(transcript_id, genetype2, length),
    names_to = 'kmer_region',
    names_pattern = '(.*)_length',
    values_to = 'region_length'
  )
}
# Summarise region lengths per (genetype2, kmer_region): total, mean, median
# and maximum, plus each region's share of the summed length of its genetype2.
#
# The result is still grouped by genetype2.
# NOTE(review): sum_length is computed without na.rm while the other three
# statistics use na.rm = TRUE, so a single NA region_length would turn
# percent_length into NA for the whole gene type. Confirm this is intended.
calc_percent_region_length <- function(df) {
  per_region <- df |>
    group_by(genetype2, kmer_region) |>
    summarise(
      sum_length = sum(region_length),
      mean_length = mean(region_length, na.rm = TRUE),
      median_length = median(region_length, na.rm = TRUE),
      max_length = max(region_length, na.rm = TRUE),
      .groups = 'drop'
    )
  per_region |>
    group_by(genetype2) |>
    mutate(percent_length = 100 * sum_length / sum(sum_length))
}
# Convert transcript positions into metagene coordinates.
#
# Adds two columns:
#   rel_position_within_region - fraction (0-1) of the way through the
#     position's own region (5'UTR, CDS or 3'UTR);
#   rel_position_metagene      - position on a 0-100 axis where each region
#     occupies the width given by `length_percentage`.
#
# df: needs kmer_region, kmer_middle, thickStart, thickEnd and length.
# length_percentage: one-row table/list with elements fiveprimeUTR, CDS and
#   threeprimeUTR (percent of total length). Defaults to the global
#   `mRNA_length_percentage`, which is what this function read before.
#
# Rows whose kmer_region is NA get NA in both new columns.
calc_relposition_within_region <- function(df,
                                           length_percentage = mRNA_length_percentage) {
  utr5_pct <- length_percentage$fiveprimeUTR
  cds_pct <- length_percentage$CDS
  utr3_pct <- length_percentage$threeprimeUTR
  df |>
    mutate(
      rel_position_within_region = case_when(
        kmer_region == 'fiveprimeUTR' ~ kmer_middle / thickStart,
        kmer_region == 'CDS' ~
          (kmer_middle - thickStart) / (thickEnd - thickStart),
        # Fix: normalise by the 3'UTR length, not the whole transcript length,
        # so this fraction spans 0-1 like the other two regions. Dividing by
        # `length` squeezed every 3'UTR site towards the stop codon.
        kmer_region == 'threeprimeUTR' ~
          (kmer_middle - thickEnd) / (length - thickEnd),
        .default = NA
      ),
      # Each region is offset by the widths of the regions upstream of it.
      rel_position_metagene = case_when(
        kmer_region == 'fiveprimeUTR' ~
          utr5_pct * rel_position_within_region,
        kmer_region == 'CDS' ~
          utr5_pct + cds_pct * rel_position_within_region,
        kmer_region == 'threeprimeUTR' ~
          utr5_pct + cds_pct + utr3_pct * rel_position_within_region,
        .default = NA
      )
    )
}
# Explode each transcript sequence into one row per nucleotide.
#
# Input needs transcript_id and transcript_seq. In the output the sequence
# column is replaced by `base` (a single character) and `position` holds the
# 1-based index of that base within its transcript, stored as double.
calc_base_position <- function(df) {
  per_base <- df |>
    dplyr::rename(base = transcript_seq) |>
    mutate(base = str_split(base, '')) |>
    unnest(base)
  per_base |>
    group_by(transcript_id) |>
    # Rows of one transcript are in sequence order, so the running row index
    # within the group is the nucleotide position.
    mutate(position = as.numeric(seq_len(n()))) |>
    ungroup()
}
# Locate CC dinucleotides in every transcript sequence.
#
# df: needs transcript_id and transcript_seq.
# overlapping: FALSE (default) keeps the previous behaviour: regex matches do
#   not overlap, so a run such as "CCC" yields one CC (positions 1-2) and the
#   CC at positions 2-3 is skipped. TRUE reports every CC start, including
#   overlapping ones, by matching a zero-width lookahead.
#
# Returns transcript_id and position, where position is the midpoint of the
# dinucleotide (first C + 0.5), one row per match.
calc_CC_position <- function(df, overlapping = FALSE) {
  cc_pattern <- if (isTRUE(overlapping)) '(?=CC)' else 'CC'
  df |>
    mutate(position = str_locate_all(transcript_seq, cc_pattern)) |>
    unnest(position) |>
    # Column 1 is the match start. For a two-base match start + 0.5 equals
    # (start + end) / 2, and it is also correct for zero-width lookahead
    # matches, whose reported end is start - 1.
    mutate(position = position[, 1] + 0.5) |>
    select(transcript_id, position)
}
# Draw and save every m3C position-density figure for one bandwidth setting.
#
# Reads the globals `allC_m3C_mRNA_relposition_bound`,
# `mRNA_length_percentage` and `figdir`. Seven density plots coloured by
# `type` are built and each is written with ggsave_multiple_formats(), with
# "_adjust_<adjust_value>" appended to the file stem.
#
# adjust_value: bandwidth multiplier handed to geom_density(adjust = ).
# Returns whatever the final ggsave_multiple_formats() call returns.
plot_metagene_distribution_different_adjustment <- function(adjust_value) {
  type_colours <- c('#8C8C8C', '#7979D2', '#E60000')
  # Metagene x-coordinates of the 5'UTR/CDS and CDS/3'UTR boundaries.
  utr5_end <- mRNA_length_percentage$fiveprimeUTR
  cds_end <- utr5_end + mRNA_length_percentage$CDS

  # Save one plot under "<stem>_adjust_<adjust_value>" with the shared size.
  save_density_plot <- function(plot, stem) {
    plot |>
      ggsave_multiple_formats(
        basename = paste0(stem, '_adjust_', adjust_value),
        outdir = figdir, width = 4, height = 4, fontsize = 7
      )
  }

  # metagene plot
  metagene_plot <-
    allC_m3C_mRNA_relposition_bound |>
    ggplot(aes(x = rel_position_metagene, colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    geom_vline(xintercept = c(utr5_end, cds_end), colour = 'gray20') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_x_continuous(limits = c(0, 100)) +
    scale_color_manual(values = type_colours)
  save_density_plot(metagene_plot, 'm3C_relposition_mRNA_metageneplot')

  # Near start codon
  # x = 0 is the first CDS nucleotide (thickStart + 1), so 0-2 is the start
  # codon. The marker is a single annotation: geom_rect(aes(<constants>))
  # would inherit the plot data and draw one rectangle per row.
  near_start_plot <-
    allC_m3C_mRNA_relposition_bound |>
    filter(kmer_region != 'threeprimeUTR') |>
    ggplot(aes(x = kmer_middle - (thickStart + 1), colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    #scale_x_continuous(limits = c(-500, 1000)) +
    annotate(
      'rect',
      xmin = 0, xmax = 2, ymin = 0, ymax = .002, alpha = 1/2
    ) +
    #geom_vline(xintercept = 0, colour = 'gray20') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_color_manual(values = type_colours)
  save_density_plot(near_start_plot, 'm3C_relposition_mRNA_nearstartcodon')

  # Near stop codon
  near_stop_plot <-
    allC_m3C_mRNA_relposition_bound |>
    filter(kmer_region != 'fiveprimeUTR') |>
    ggplot(aes(x = kmer_middle - thickEnd, colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    #scale_x_continuous(limits = c(-2000, 2000)) +
    geom_vline(xintercept = 0, colour = 'gray20') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_color_manual(values = type_colours)
  save_density_plot(near_stop_plot, 'm3C_relposition_mRNA_nearstopcodon')

  # 5UTR near start codon
  # NOTE(review): unlike the other panels this density has no lwd = 1.1;
  # kept as it was. Confirm whether that is intentional.
  utr5_start_plot <-
    allC_m3C_mRNA_relposition_bound |>
    filter(kmer_region == 'fiveprimeUTR') |>
    ggplot(aes(x = kmer_middle - (thickStart + 1), colour = type)) +
    geom_density(adjust = adjust_value) +
    geom_vline(xintercept = c(-25, -10, 0), colour = 'gray20') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_x_continuous(limits = c(-200, 0)) +
    scale_color_manual(values = type_colours)
  save_density_plot(
    utr5_start_plot, 'm3C_relposition_mRNA_5UTR_nearstartcodon'
  )

  # CDS near start codon
  cds_start_plot <-
    allC_m3C_mRNA_relposition_bound |>
    filter(kmer_region == 'CDS') |>
    ggplot(aes(x = kmer_middle - (thickStart + 1), colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    geom_vline(xintercept = c(0, 80, 350), colour = 'gray') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_color_manual(values = type_colours)
  save_density_plot(cds_start_plot, 'm3C_relposition_mRNA_CDS_nearstartcodon')

  # CDS near stop codon
  cds_stop_plot <-
    allC_m3C_mRNA_relposition_bound |>
    filter(kmer_region == 'CDS') |>
    ggplot(aes(x = kmer_middle - thickEnd, colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    geom_vline(xintercept = c(0), colour = 'gray') +
    geom_hline(yintercept = 0, colour = 'gray') +
    scale_color_manual(values = type_colours)
  save_density_plot(cds_stop_plot, 'm3C_relposition_mRNA_CDS_nearstopcodon')

  # 3UTR near stop codon
  utr3_stop_plot <-
    allC_m3C_mRNA_relposition_bound |>
    filter(kmer_region == 'threeprimeUTR') |>
    ggplot(aes(x = kmer_middle - thickEnd, colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    geom_vline(xintercept = c(0), colour = 'gray') +
    geom_hline(yintercept = 0, colour = 'gray') +
    scale_x_continuous(limits = c(0, 2000)) +
    scale_color_manual(values = type_colours)
  save_density_plot(utr3_stop_plot, 'm3C_relposition_mRNA_3UTR_nearstopcodon')
}
# Load the DRS m3C site table: each row carries a reference k-mer, its
# transcript coordinates and the annotation of the transcript it lies on.
DRS_methylated_positions <- read_tsv(
  paste_wd('Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-22.tsv')
)
## Rows: 489 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2
## dbl (7): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kmer...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print for a quick visual check.
DRS_methylated_positions
## # A tibble: 489 × 13
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCG 58 62
## 5 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCT 76 80
## 6 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ATCAA 94 98
## 7 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA GCCAC 149 153
## 8 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCC 154 158
## 9 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCC 155 159
## 10 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCA 156 160
## # ℹ 479 more rows
## # ℹ 6 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>
# CDS boundaries (thickStart / thickEnd) per transcript, in transcript
# coordinates, read from the BED12 annotation.
gencode_annotation_CDS <-
read_CDS_annotation_bed()
# NOTE(review): the recorded run below reports parsing issues for this file;
# inspect them with problems() before trusting every row.
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
gencode_annotation_CDS
## # A tibble: 111,324 × 5
## transcript_id start end thickStart thickEnd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ENST00000003912.7 0 5481 715 1687
## 2 ENST00000008440.9 0 1667 128 749
## 3 ENST00000009105.5 0 2612 245 1673
## 4 ENST00000010299.10 0 1050 24 1047
## 5 ENST00000011700.10 0 10969 0 9629
## 6 ENST00000054650.9 0 1361 156 873
## 7 ENST00000054666.11 0 2178 88 388
## 8 ENST00000078527.9 0 2066 160 1639
## 9 ENST00000164247.5 0 4273 564 1665
## 10 ENST00000166244.8 0 5019 147 3162
## # ℹ 111,314 more rows
# Transcript sequences (one string per transcript). The stored
# transcript_length column is dropped because it is not used below.
espresso_transcript_seqs <-
  paste_wd('Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz') |>
  read_tsv() |>
  select(!transcript_length)
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
espresso_transcript_seqs
## # A tibble: 36,717 × 2
## transcript_id transcript_seq
## <chr> <chr>
## 1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCCGATGTGT…
## 2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCCGATGTGT…
## 3 ENST00000420393.5 CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGGCAGAGTTGGTGGCGTGAG…
## 4 ENST00000698415.1 GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTCTGCTAGCCAAAGACCAAC…
## 5 ENST00000698416.1 CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTCACACACTAACCTTTTTAA…
## 6 ENST00000488263.5 AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTTATCTTTCTTGGGATTCTA…
## 7 ENST00000424814.5 GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCTGCCGCTCCTGCCTGCAG…
## 8 ENST00000231948.9 AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCT…
## 9 ENST00000432408.6 GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGC…
## 10 ENST00000459840.5 ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTGATTAAATTGCTACCAGTG…
## # ℹ 36,707 more rows
# Unique IDs of the transcripts that carry at least one m3C site.
DRS_methylated_RNAs <- distinct(DRS_methylated_positions, transcript_id)
DRS_methylated_RNAs
## # A tibble: 71 × 1
## transcript_id
## <chr>
## 1 ENST00000429711.7
## 2 ENST00000647248.2
## 3 ENST00000389680.2
## 4 ENST00000361390.2
## 5 ENST00000361453.3
## 6 ENST00000387347.2
## 7 ENST00000361624.2
## 8 ENST00000361739.1
## 9 ENST00000361899.2
## 10 ENST00000361227.2
## # ℹ 61 more rows
# One annotation row per methylated transcript. starts_with('gene') picks up
# gene_name, gene_type and genetype2; starts_with('transcript') adds
# transcript_id.
DRS_methylated_RNAs_annotation <- distinct(
  select(
    DRS_methylated_positions,
    starts_with('gene'), starts_with('transcript')
  )
)
DRS_methylated_RNAs_annotation
## # A tibble: 71 × 4
## gene_name gene_type genetype2 transcript_id
## <chr> <chr> <chr> <chr>
## 1 RPL32 protein_coding mRNA ENST00000429711.7
## 2 RPL35A protein_coding mRNA ENST00000647248.2
## 3 MT-RNR1 Mt_rRNA Mt_rRNA ENST00000389680.2
## 4 MT-ND1 protein_coding mt-mRNA ENST00000361390.2
## 5 MT-ND2 protein_coding mt-mRNA ENST00000361453.3
## 6 MT-RNR2 Mt_rRNA Mt_rRNA ENST00000387347.2
## 7 MT-CO1 protein_coding mt-mRNA ENST00000361624.2
## 8 MT-CO2 protein_coding mt-mRNA ENST00000361739.1
## 9 MT-ATP6 protein_coding mt-mRNA ENST00000361899.2
## 10 MT-ND3 protein_coding mt-mRNA ENST00000361227.2
## # ℹ 61 more rows
# One row per nucleotide of every methylated transcript.
# right_join() keeps all methylated transcripts, including any without a
# sequence in espresso_transcript_seqs. The join key is spelled out so that
# an extra shared column can never silently become part of the key.
methylated_RNAs_base_positions <-
  espresso_transcript_seqs |>
  right_join(DRS_methylated_RNAs, by = 'transcript_id') |>
  calc_base_position()
methylated_RNAs_base_positions
## # A tibble: 101,437 × 3
## transcript_id base position
## <chr> <chr> <dbl>
## 1 ENST00000429711.7 A 1
## 2 ENST00000429711.7 G 2
## 3 ENST00000429711.7 C 3
## 4 ENST00000429711.7 C 4
## 5 ENST00000429711.7 C 5
## 6 ENST00000429711.7 T 6
## 7 ENST00000429711.7 T 7
## 8 ENST00000429711.7 G 8
## 9 ENST00000429711.7 C 9
## 10 ENST00000429711.7 G 10
## # ℹ 101,427 more rows
# Background set: every cytidine of the methylated transcripts.
methylated_RNAs_C_positions <- filter(
  methylated_RNAs_base_positions,
  base == 'C'
)
methylated_RNAs_C_positions
## # A tibble: 24,117 × 3
## transcript_id base position
## <chr> <chr> <dbl>
## 1 ENST00000429711.7 C 3
## 2 ENST00000429711.7 C 4
## 3 ENST00000429711.7 C 5
## 4 ENST00000429711.7 C 9
## 5 ENST00000429711.7 C 11
## 6 ENST00000429711.7 C 13
## 7 ENST00000429711.7 C 14
## 8 ENST00000429711.7 C 16
## 9 ENST00000429711.7 C 17
## 10 ENST00000429711.7 C 20
## # ℹ 24,107 more rows
# Attach CDS boundaries to every m3C site. Sites on transcripts missing from
# the annotation keep NA boundaries. The join key is explicit so the match
# can never widen to other shared columns.
DRS_methylated_positions_CDSpos <-
  DRS_methylated_positions |>
  left_join(gencode_annotation_CDS, by = 'transcript_id')
# Export the sites that have CDS information, labelled with their region.
DRS_methylated_positions_CDSpos |>
  remove_noCDSinfo_RNAs() |>
  determine_kmer_region() |>
  export_tsv(outdir = tabledir, basename = 'DRS_methylated_positions_CDSpos')
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_CDS/DRS_methylated_positions_CDSpos_2024-07-29.tsv
## # A tibble: 436 × 18
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCA 33 37
## 5 ENST00000361390.2 MT-ND1 chrM protein_cod… CCCCT 123 127
## 6 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCT 141 145
## 7 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCG 186 190
## 8 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCT 205 209
## 9 ENST00000361390.2 MT-ND1 chrM protein_cod… CCCCC 260 264
## 10 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCTC 322 326
## # ℹ 426 more rows
## # ℹ 11 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## # start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>, kmer_region <chr>
# Region label for every cytidine of the methylated transcripts.
# `position` is renamed to kmer_middle and the BED `end` is copied to `length`
# so the table has the columns determine_kmer_region() expects. Both joins
# name their key explicitly.
allC_methylatedRNAs_regioninfo <-
  methylated_RNAs_C_positions |>
  left_join(gencode_annotation_CDS, by = 'transcript_id') |>
  dplyr::rename(kmer_middle = position) |>
  mutate(length = end) |>
  remove_noCDSinfo_RNAs() |>
  determine_kmer_region() |>
  left_join(DRS_methylated_RNAs_annotation, by = 'transcript_id')
# No basename is given here; the recorded output shows the file being named
# after this variable, so the piped symbol must stay as it is.
allC_methylatedRNAs_regioninfo |>
  export_tsv(outdir = tabledir)
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_CDS/allC_methylatedRNAs_regioninfo_2024-07-29.tsv
## # A tibble: 22,334 × 12
## transcript_id base kmer_middle start end thickStart thickEnd length
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENST00000429711.7 C 3 0 2094 77 482 2094
## 2 ENST00000429711.7 C 4 0 2094 77 482 2094
## 3 ENST00000429711.7 C 5 0 2094 77 482 2094
## 4 ENST00000429711.7 C 9 0 2094 77 482 2094
## 5 ENST00000429711.7 C 11 0 2094 77 482 2094
## 6 ENST00000429711.7 C 13 0 2094 77 482 2094
## 7 ENST00000429711.7 C 14 0 2094 77 482 2094
## 8 ENST00000429711.7 C 16 0 2094 77 482 2094
## 9 ENST00000429711.7 C 17 0 2094 77 482 2094
## 10 ENST00000429711.7 C 20 0 2094 77 482 2094
## # ℹ 22,324 more rows
## # ℹ 4 more variables: kmer_region <chr>, gene_name <chr>, gene_type <chr>,
## # genetype2 <chr>
# Share of all cytidines falling in each region, per gene type.
# The result stays grouped by genetype2.
allC_percentage_groupedby_region <-
  allC_methylatedRNAs_regioninfo |>
  count(genetype2, kmer_region) |>
  group_by(genetype2) |>
  mutate(percent_C = 100 * n / sum(n))
allC_percentage_groupedby_region
## # A tibble: 4 × 4
## # Groups: genetype2 [2]
## genetype2 kmer_region n percent_C
## <chr> <chr> <int> <dbl>
## 1 mRNA CDS 7490 38.4
## 2 mRNA fiveprimeUTR 1845 9.45
## 3 mRNA threeprimeUTR 10184 52.2
## 4 mt-mRNA CDS 2815 100
# Share of m3C sites falling in each region, per gene type.
# Sites on transcripts without CDS coordinates are dropped before counting.
m3C_percentage_region <-
DRS_methylated_positions_CDSpos |>
remove_noCDSinfo_RNAs() |>
determine_kmer_region() |>
calc_percent_m3C_in_region() |>
arrange(genetype2)
m3C_percentage_region
## # A tibble: 4 × 4
## # Groups: genetype2 [2]
## kmer_region genetype2 num_m3C percent_m3C
## <chr> <chr> <int> <dbl>
## 1 CDS mRNA 179 70.5
## 2 fiveprimeUTR mRNA 30 11.8
## 3 threeprimeUTR mRNA 45 17.7
## 4 CDS mt-mRNA 182 100
# Length statistics of each region across the methylated transcripts, and
# each region's percentage of the summed length per gene type.
length_percentage_region <-
DRS_methylated_positions_CDSpos |>
remove_noCDSinfo_RNAs() |>
calc_region_length() |>
calc_percent_region_length()
length_percentage_region
## # A tibble: 6 × 7
## # Groups: genetype2 [2]
## genetype2 kmer_region sum_length mean_length median_length max_length
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 mRNA CDS 29715 531. 406. 1659
## 2 mRNA fiveprimeUTR 5341 95.4 70 433
## 3 mRNA threeprimeUTR 50435 901. 248. 7245
## 4 mt-mRNA CDS 8548 950. 956 1542
## 5 mt-mRNA fiveprimeUTR 0 0 0 0
## 6 mt-mRNA threeprimeUTR 6 0.667 0 3
## # ℹ 1 more variable: percent_length <dbl>
# Combine the three per-region percentages (m3C sites, all C, length) into
# one long table: one row per region, gene type and measure. full_join()
# keeps region/gene-type pairs present in only one input, leaving NA for the
# missing measures. The join keys are named explicitly.
length_m3Csites_percentage_groupedby_region <-
  full_join(
    m3C_percentage_region, length_percentage_region,
    by = c('kmer_region', 'genetype2')
  ) |>
  full_join(
    allC_percentage_groupedby_region,
    by = c('kmer_region', 'genetype2')
  ) |>
  select(kmer_region, genetype2, percent_m3C, percent_C, percent_length) |>
  pivot_longer(
    cols = starts_with('percent_'),
    names_prefix = 'percent_', values_to = 'percent'
  )
length_m3Csites_percentage_groupedby_region
## # A tibble: 18 × 4
## # Groups: genetype2 [2]
## kmer_region genetype2 name percent
## <chr> <chr> <chr> <dbl>
## 1 CDS mRNA m3C 70.5
## 2 CDS mRNA C 38.4
## 3 CDS mRNA length 34.8
## 4 fiveprimeUTR mRNA m3C 11.8
## 5 fiveprimeUTR mRNA C 9.45
## 6 fiveprimeUTR mRNA length 6.25
## 7 threeprimeUTR mRNA m3C 17.7
## 8 threeprimeUTR mRNA C 52.2
## 9 threeprimeUTR mRNA length 59.0
## 10 CDS mt-mRNA m3C 100
## 11 CDS mt-mRNA C 100
## 12 CDS mt-mRNA length 99.9
## 13 fiveprimeUTR mt-mRNA m3C NA
## 14 fiveprimeUTR mt-mRNA C NA
## 15 fiveprimeUTR mt-mRNA length 0
## 16 threeprimeUTR mt-mRNA m3C NA
## 17 threeprimeUTR mt-mRNA C NA
## 18 threeprimeUTR mt-mRNA length 0.0701
# One-row table holding the percentage of mRNA length taken by each region
# (columns CDS, fiveprimeUTR, threeprimeUTR).
# calc_relposition_within_region() and
# plot_metagene_distribution_different_adjustment() read this object as a
# global, so it must exist before either of them is called.
mRNA_length_percentage <-
length_percentage_region |>
filter(genetype2 == 'mRNA') |>
pivot_wider(id_cols = c(genetype2), names_from = kmer_region, values_from = percent_length)
mRNA_length_percentage
## # A tibble: 1 × 4
## # Groups: genetype2 [1]
## genetype2 CDS fiveprimeUTR threeprimeUTR
## <chr> <dbl> <dbl> <dbl>
## 1 mRNA 34.8 6.25 59.0
# m3C sites on mRNAs, labelled with their region.
# remove_noCDSinfo_RNAs() is not applied here, so sites lacking CDS
# coordinates stay in the table with kmer_region = NA.
DRS_methylated_positions_CDSpos_regioninfo <-
DRS_methylated_positions_CDSpos |>
filter(genetype2 == 'mRNA') |>
determine_kmer_region()
DRS_methylated_positions_CDSpos_regioninfo
## # A tibble: 257 × 18
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000215754.8 MIF chr22 protein_cod… GTCCT 79 83
## 5 ENST00000215754.8 MIF chr22 protein_cod… GCCAC 180 184
## 6 ENST00000215754.8 MIF chr22 protein_cod… GCCCC 191 195
## 7 ENST00000215754.8 MIF chr22 protein_cod… ACCCG 484 488
## 8 ENST00000199764.7 CEACAM6 chr19 protein_cod… TTCAG 1698 1702
## 9 ENST00000270625.7 RPS11 chr19 protein_cod… ACCCA 161 165
## 10 ENST00000270625.7 RPS11 chr19 protein_cod… CACCA 473 477
## # ℹ 247 more rows
## # ℹ 11 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## # start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>, kmer_region <chr>
# Metagene coordinates of the m3C sites on mRNAs (adds
# rel_position_within_region and rel_position_metagene).
m3C_relposition_mRNA_metagene <-
DRS_methylated_positions_CDSpos_regioninfo |>
calc_relposition_within_region()
m3C_relposition_mRNA_metagene
## # A tibble: 257 × 20
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000215754.8 MIF chr22 protein_cod… GTCCT 79 83
## 5 ENST00000215754.8 MIF chr22 protein_cod… GCCAC 180 184
## 6 ENST00000215754.8 MIF chr22 protein_cod… GCCCC 191 195
## 7 ENST00000215754.8 MIF chr22 protein_cod… ACCCG 484 488
## 8 ENST00000199764.7 CEACAM6 chr19 protein_cod… TTCAG 1698 1702
## 9 ENST00000270625.7 RPS11 chr19 protein_cod… ACCCA 161 165
## 10 ENST00000270625.7 RPS11 chr19 protein_cod… CACCA 473 477
## # ℹ 247 more rows
## # ℹ 13 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## # start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>,
## # kmer_region <chr>, rel_position_within_region <dbl>,
## # rel_position_metagene <dbl>
# Metagene coordinates of every cytidine in the methylated transcripts.
# The mRNA region widths are applied to all rows, mt-mRNA included; the
# mt-mRNA rows are filtered out later, when the plotting table is assembled.
allC_methylatedRNAs_regioninfo_relposition <-
allC_methylatedRNAs_regioninfo |>
calc_relposition_within_region()
allC_methylatedRNAs_regioninfo_relposition
## # A tibble: 22,334 × 14
## transcript_id base kmer_middle start end thickStart thickEnd length
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENST00000429711.7 C 3 0 2094 77 482 2094
## 2 ENST00000429711.7 C 4 0 2094 77 482 2094
## 3 ENST00000429711.7 C 5 0 2094 77 482 2094
## 4 ENST00000429711.7 C 9 0 2094 77 482 2094
## 5 ENST00000429711.7 C 11 0 2094 77 482 2094
## 6 ENST00000429711.7 C 13 0 2094 77 482 2094
## 7 ENST00000429711.7 C 14 0 2094 77 482 2094
## 8 ENST00000429711.7 C 16 0 2094 77 482 2094
## 9 ENST00000429711.7 C 17 0 2094 77 482 2094
## 10 ENST00000429711.7 C 20 0 2094 77 482 2094
## # ℹ 22,324 more rows
## # ℹ 6 more variables: kmer_region <chr>, gene_name <chr>, gene_type <chr>,
## # genetype2 <chr>, rel_position_within_region <dbl>,
## # rel_position_metagene <dbl>
# Midpoints of the CC dinucleotides found in the methylated transcripts.
# All joins in this block name their key explicitly.
methylated_RNAs_CC_positions <-
  espresso_transcript_seqs |>
  right_join(DRS_methylated_RNAs, by = 'transcript_id') |>
  calc_CC_position()
# Same region labelling and metagene conversion as for the single-C
# background, tagged as type 'all CC' for plotting.
methylated_RNAs_CC_relpositions <-
  methylated_RNAs_CC_positions |>
  left_join(gencode_annotation_CDS, by = 'transcript_id') |>
  dplyr::rename(kmer_middle = position) |>
  mutate(length = end) |>
  remove_noCDSinfo_RNAs() |>
  determine_kmer_region() |>
  left_join(DRS_methylated_RNAs_annotation, by = 'transcript_id') |>
  calc_relposition_within_region() |>
  mutate(type = 'all CC')
methylated_RNAs_CC_relpositions
## # A tibble: 5,081 × 14
## transcript_id kmer_middle start end thickStart thickEnd length kmer_region
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 ENST000004297… 3.5 0 2094 77 482 2094 fiveprimeU…
## 2 ENST000004297… 13.5 0 2094 77 482 2094 fiveprimeU…
## 3 ENST000004297… 16.5 0 2094 77 482 2094 fiveprimeU…
## 4 ENST000004297… 20.5 0 2094 77 482 2094 fiveprimeU…
## 5 ENST000004297… 32.5 0 2094 77 482 2094 fiveprimeU…
## 6 ENST000004297… 43.5 0 2094 77 482 2094 fiveprimeU…
## 7 ENST000004297… 59.5 0 2094 77 482 2094 fiveprimeU…
## 8 ENST000004297… 65.5 0 2094 77 482 2094 fiveprimeU…
## 9 ENST000004297… 82.5 0 2094 77 482 2094 CDS
## 10 ENST000004297… 85.5 0 2094 77 482 2094 CDS
## # ℹ 5,071 more rows
## # ℹ 6 more variables: gene_name <chr>, gene_type <chr>, genetype2 <chr>,
## # rel_position_within_region <dbl>, rel_position_metagene <dbl>, type <chr>
# Plotting table: all C, m3C sites and all CC stacked in that order, labelled
# by `type` and restricted to mRNAs. A single filter after stacking removes
# the non-mRNA rows from every source.
allC_m3C_mRNA_relposition_bound <-
  bind_rows(
    mutate(allC_methylatedRNAs_regioninfo_relposition, type = 'allC'),
    mutate(m3C_relposition_mRNA_metagene, type = 'm3C'),
    methylated_RNAs_CC_relpositions
  ) |>
  filter(genetype2 == 'mRNA')
allC_m3C_mRNA_relposition_bound |>
  export_tsv(outdir = tabledir, compression = 'gz')
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_CDS/allC_m3C_mRNA_relposition_bound_2024-07-29.tsv.gz
## # A tibble: 24,144 × 22
## transcript_id base kmer_middle start end thickStart thickEnd length
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENST00000429711.7 C 3 0 2094 77 482 2094
## 2 ENST00000429711.7 C 4 0 2094 77 482 2094
## 3 ENST00000429711.7 C 5 0 2094 77 482 2094
## 4 ENST00000429711.7 C 9 0 2094 77 482 2094
## 5 ENST00000429711.7 C 11 0 2094 77 482 2094
## 6 ENST00000429711.7 C 13 0 2094 77 482 2094
## 7 ENST00000429711.7 C 14 0 2094 77 482 2094
## 8 ENST00000429711.7 C 16 0 2094 77 482 2094
## 9 ENST00000429711.7 C 17 0 2094 77 482 2094
## 10 ENST00000429711.7 C 20 0 2094 77 482 2094
## # ℹ 24,134 more rows
## # ℹ 14 more variables: kmer_region <chr>, gene_name <chr>, gene_type <chr>,
## # genetype2 <chr>, rel_position_within_region <dbl>,
## # rel_position_metagene <dbl>, type <chr>, seqname <chr>, ref_kmer <chr>,
## # kmer_start <dbl>, kmer_end <dbl>, rel_kmer_start <dbl>,
## # rel_kmer_middle <dbl>, rel_kmer_end <dbl>
# Write the combined percentage table to tabledir. No basename is passed; the
# recorded output shows the file being named after the piped variable.
length_m3Csites_percentage_groupedby_region |>
export_tsv(outdir = tabledir)
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_CDS/length_m3Csites_percentage_groupedby_region_2024-07-29.tsv
## # A tibble: 18 × 4
## # Groups: genetype2 [2]
## kmer_region genetype2 name percent
## <chr> <chr> <chr> <dbl>
## 1 CDS mRNA m3C 70.5
## 2 CDS mRNA C 38.4
## 3 CDS mRNA length 34.8
## 4 fiveprimeUTR mRNA m3C 11.8
## 5 fiveprimeUTR mRNA C 9.45
## 6 fiveprimeUTR mRNA length 6.25
## 7 threeprimeUTR mRNA m3C 17.7
## 8 threeprimeUTR mRNA C 52.2
## 9 threeprimeUTR mRNA length 59.0
## 10 CDS mt-mRNA m3C 100
## 11 CDS mt-mRNA C 100
## 12 CDS mt-mRNA length 99.9
## 13 fiveprimeUTR mt-mRNA m3C NA
## 14 fiveprimeUTR mt-mRNA C NA
## 15 fiveprimeUTR mt-mRNA length 0
## 16 threeprimeUTR mt-mRNA m3C NA
## 17 threeprimeUTR mt-mRNA C NA
## 18 threeprimeUTR mt-mRNA length 0.0701
# Stacked horizontal bars comparing, per gene type, how length, all C and m3C
# sites are distributed over 5'UTR, CDS and 3'UTR. The factor levels fix the
# stacking order of the regions and the order of the three bars.
percentage_m3Csites_groupedby_region <-
  length_m3Csites_percentage_groupedby_region |>
  mutate(
    kmer_region = factor(
      kmer_region,
      levels = c('fiveprimeUTR', 'CDS', 'threeprimeUTR')
    ),
    name = factor(name, levels = c('length', 'C', 'm3C'))
  ) |>
  ggplot(aes(x = name, y = percent, fill = kmer_region)) +
  # geom_col() is the bar geom that plots the y values as given.
  geom_col() +
  coord_flip() +
  scale_y_reverse() +
  scale_fill_manual(values = c('#90d2d8', '#f6a6b2', '#ffecb8')) +
  facet_wrap(vars(genetype2), ncol = 1, scales = 'free')
# The plot variable is piped in by name because no basename is supplied.
percentage_m3Csites_groupedby_region |>
  ggsave_multiple_formats(
    outdir = figdir, width = 4, height = 4.5, fontsize = 7
  )
## Warning: Removed 4 rows containing missing values (`position_stack()`).
## Removed 4 rows containing missing values (`position_stack()`).
## Removed 4 rows containing missing values (`position_stack()`).
## Removed 4 rows containing missing values (`position_stack()`).
## Removed 4 rows containing missing values (`position_stack()`).
# Bandwidth multipliers to try for the density plots (under- to
# over-smoothed).
adjust_value_list <- c(1/10, 1/5, 1/2, 1, 2, 5, 10)
# The plotting function is called only for its side effect of writing
# figures, so walk() is used: it discards the return values instead of
# collecting and printing a list of them.
adjust_value_list |>
  walk(plot_metagene_distribution_different_adjustment)
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## [[1]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
##
## [[2]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
##
## [[3]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
##
## [[4]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
##
## [[5]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
##
## [[6]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
##
## [[7]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
# Draw the density of `rel_position_within_region` for one transcript region
# (one curve per `type`) and save the figure under `figdir`.
#
# .region:     value of `kmer_region` to keep
#              (e.g. 'fiveprimeUTR', 'CDS', 'threeprimeUTR').
# .adjust_val: bandwidth multiplier forwarded to geom_density(adjust = ).
#
# Reads the globals `allC_m3C_mRNA_relposition_bound` and `figdir`.
# Returns whatever ggsave_multiple_formats() returns.
plot_metagene_distribution_oneregion_dif_adjust <- function(.region, .adjust_val) {
  region_rows <- filter(allC_m3C_mRNA_relposition_bound, kmer_region == .region)
  type_colours <- c('#8C8C8C', '#7979D2', '#E60000')

  density_plot <-
    ggplot(region_rows, aes(x = rel_position_within_region, colour = type)) +
    geom_density(adjust = .adjust_val, lwd = 1.1) +
    # geom_vline(xintercept = c(0), colour = 'gray20') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_color_manual(values = type_colours)

  # The output file name encodes both the region and the bandwidth multiplier.
  ggsave_multiple_formats(
    density_plot,
    outdir = figdir,
    basename = paste0('metageneplot_', .region, '_adjust_', .adjust_val),
    width = 4, height = 4, fontsize = 7
  )
}
# Regions for which a per-region metagene density plot is produced.
unique_regions <- c("fiveprimeUTR", "CDS", "threeprimeUTR")
# One plot per region, all with the same bandwidth multiplier (0.5).
map(
  unique_regions,
  \(region) plot_metagene_distribution_oneregion_dif_adjust(region, 0.5)
)
## [[1]]
##
## [[2]]
##
## [[3]]
# Density of the relative position inside the CDS, one curve per `type`.
ggplot(
  filter(allC_m3C_mRNA_relposition_bound, kmer_region == 'CDS'),
  aes(x = rel_position_within_region, colour = type)
) +
  geom_density(adjust = 0.5) +
  # geom_vline(xintercept = c(0), colour = 'gray20') +
  # geom_hline(yintercept = 0, colour = 'gray20') +
  scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
# Same CDS sites on an absolute scale: offset of the k-mer centre from
# thickStart + 1, with reference lines at x = 0, 80 and 350 and at y = 0.
ggplot(
  filter(allC_m3C_mRNA_relposition_bound, kmer_region == 'CDS'),
  aes(x = kmer_middle - (thickStart + 1), colour = type)
) +
  geom_density(adjust = 0.5) +
  geom_vline(xintercept = c(0, 80, 350), colour = 'gray') +
  geom_hline(yintercept = 0, colour = 'gray') +
  scale_color_manual(values = c('gray', 'blue', 'red'))
# allC_m3C_mRNA_relposition_bound |>
# filter(kmer_region == 'CDS') |>
# ggplot(aes(x = rel_position, colour = type)) +
# geom_density(adjust = .5) +
# geom_vline(xintercept = 0, colour = 'gray20') +
# geom_hline(yintercept = 0, colour = 'gray20') +
# scale_color_manual(values = c('blue', 'red'))
# Split the m3C sites by whether `length - thickEnd` (the 3'UTR length) is
# below 248, then report the number of sites and how many of them lie in the
# 3'UTR. The grouping expression is kept verbatim: it becomes the column name.
m3C_relposition_mRNA_metagene |>
  group_by((length - thickEnd) < 248) |>
  summarise(
    n = n(),
    num_m3C_3UTR = sum(kmer_region == 'threeprimeUTR')
  )
## # A tibble: 3 × 3
## `(length - thickEnd) < 248` n num_m3C_3UTR
## <lgl> <int> <int>
## 1 FALSE 117 36
## 2 TRUE 137 9
## 3 NA 3 NA
# Split the m3C sites by whether `thickStart` (the 5'UTR length) is below 70,
# then report the number of sites and how many of them lie in the 5'UTR.
# The grouping expression is kept verbatim: it becomes the column name.
m3C_relposition_mRNA_metagene |>
  group_by(thickStart < 70) |>
  summarise(
    n = n(),
    num_m3C_5UTR = sum(kmer_region == 'fiveprimeUTR')
  )
## # A tibble: 3 × 3
## `thickStart < 70` n num_m3C_5UTR
## <lgl> <int> <int>
## 1 FALSE 136 25
## 2 TRUE 118 5
## 3 NA 3 NA
# Split the m3C sites by whether `thickEnd - thickStart` (the CDS length) is
# below 406, then report the number of sites and how many of them lie in the
# CDS. The grouping expression is kept verbatim: it becomes the column name.
m3C_relposition_mRNA_metagene |>
  group_by(thickEnd - thickStart < 406) |>
  summarise(
    n = n(),
    num_m3C_CDS = sum(kmer_region == 'CDS')
  )
## # A tibble: 3 × 3
## `thickEnd - thickStart < 406` n num_m3C_CDS
## <lgl> <int> <int>
## 1 FALSE 152 127
## 2 TRUE 102 52
## 3 NA 3 NA
# Number of positions for every (type, kmer_region) combination.
allC_m3C_mRNA_relposition_bound |>
  count(type, kmer_region)
## # A tibble: 10 × 3
## type kmer_region n
## <chr> <chr> <int>
## 1 all CC CDS 1730
## 2 all CC fiveprimeUTR 479
## 3 all CC threeprimeUTR 2159
## 4 allC CDS 7490
## 5 allC fiveprimeUTR 1845
## 6 allC threeprimeUTR 10184
## 7 m3C CDS 179
## 8 m3C fiveprimeUTR 30
## 9 m3C threeprimeUTR 45
## 10 m3C <NA> 3
Frame
# Percentage of positions falling in each of the three phases, per region,
# type and gene type ('all CC' rows and rows without a region are excluded).
# `frame` is the offset of the k-mer centre from thickEnd + 1, modulo 3.
# NOTE(review): the phase is anchored at thickEnd rather than thickStart; the
# two agree only when thickEnd - thickStart is a multiple of 3 -- confirm this
# is intended.
allC_m3C_mRNA_relposition_bound |>
  filter(type != 'all CC', !is.na(kmer_region)) |>
  mutate(frame = (kmer_middle - (thickEnd + 1)) %% 3) |>
  count(frame, kmer_region, type, genetype2) |>
  group_by(kmer_region, type, genetype2) |>
  mutate(percentage = 100 * n / sum(n)) |>
  pivot_wider(
    id_cols = c(kmer_region, type, genetype2),
    names_from = frame,
    names_prefix = 'frame_',
    values_from = percentage
  )
## # A tibble: 6 × 6
## # Groups: kmer_region, type, genetype2 [6]
## kmer_region type genetype2 frame_0 frame_1 frame_2
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 CDS allC mRNA 27.3 28.7 44.0
## 2 CDS m3C mRNA 20.7 26.3 53.1
## 3 fiveprimeUTR allC mRNA 29.1 33.6 37.3
## 4 fiveprimeUTR m3C mRNA 26.7 30 43.3
## 5 threeprimeUTR allC mRNA 32.8 33.7 33.5
## 6 threeprimeUTR m3C mRNA 28.9 26.7 44.4
# Print the combined position table to inspect its columns.
allC_m3C_mRNA_relposition_bound
## # A tibble: 24,144 × 22
## transcript_id base kmer_middle start end thickStart thickEnd length
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENST00000429711.7 C 3 0 2094 77 482 2094
## 2 ENST00000429711.7 C 4 0 2094 77 482 2094
## 3 ENST00000429711.7 C 5 0 2094 77 482 2094
## 4 ENST00000429711.7 C 9 0 2094 77 482 2094
## 5 ENST00000429711.7 C 11 0 2094 77 482 2094
## 6 ENST00000429711.7 C 13 0 2094 77 482 2094
## 7 ENST00000429711.7 C 14 0 2094 77 482 2094
## 8 ENST00000429711.7 C 16 0 2094 77 482 2094
## 9 ENST00000429711.7 C 17 0 2094 77 482 2094
## 10 ENST00000429711.7 C 20 0 2094 77 482 2094
## # ℹ 24,134 more rows
## # ℹ 14 more variables: kmer_region <chr>, gene_name <chr>, gene_type <chr>,
## # genetype2 <chr>, rel_position_within_region <dbl>,
## # rel_position_metagene <dbl>, type <chr>, seqname <chr>, ref_kmer <chr>,
## # kmer_start <dbl>, kmer_end <dbl>, rel_kmer_start <dbl>,
## # rel_kmer_middle <dbl>, rel_kmer_end <dbl>
Codon
# All 64 DNA codons, enumerated with bases in the order T, C, A, G and the
# first base varying slowest (TTT, TTC, TTA, TTG, TCT, ..., GGG).
codons <- local({
  bases <- c('T', 'C', 'A', 'G')
  # expand.grid() varies its first column fastest, so list the third codon
  # position first and the first codon position last.
  grid <- expand.grid(
    third = bases, second = bases, first = bases,
    stringsAsFactors = FALSE
  )
  paste0(grid$first, grid$second, grid$third)
})
# Sequences of the methylated RNAs joined to their CDS coordinates and
# annotation; transcripts without CDS coordinates are dropped and the coding
# sequence (thickStart + 1 .. thickEnd) is extracted into `CDS_seq`.
methylated_RNAs_CDSseq <-
  right_join(x = espresso_transcript_seqs, y = DRS_methylated_RNAs) |>
  left_join(gencode_annotation_CDS) |>
  # Same filter as remove_noCDSinfo_RNAs().
  filter(!is.na(thickStart) & !is.na(thickEnd)) |>
  left_join(DRS_methylated_RNAs_annotation) |>
  mutate(
    CDS_seq = str_sub(
      string = transcript_seq, start = thickStart + 1, end = thickEnd
    )
  )
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
# Print the table to check the joined columns and the extracted CDS_seq.
methylated_RNAs_CDSseq
## # A tibble: 65 × 10
## transcript_id transcript_seq start end thickStart thickEnd gene_name
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 ENST00000429711.7 AGCCCTTGCGCGCCAC… 0 2094 77 482 RPL32
## 2 ENST00000647248.2 CTTCTCTTACCGCCAT… 0 1234 64 394 RPL35A
## 3 ENST00000361390.2 ATACCCATGGCCAACC… 0 956 0 956 MT-ND1
## 4 ENST00000361453.3 ATTAATCCCCTGGCCC… 0 1042 0 1042 MT-ND2
## 5 ENST00000361624.2 ATGTTCGCCGACCGTT… 0 1542 0 1542 MT-CO1
## 6 ENST00000361739.1 ATGGCACATGCAGCGC… 0 684 0 681 MT-CO2
## 7 ENST00000361899.2 ATGAACGAAAATCTGT… 0 681 0 678 MT-ATP6
## 8 ENST00000362079.2 ATGACCCACCAATCAC… 0 784 0 784 MT-CO3
## 9 ENST00000361227.2 ATAAACTTCGCCTTAA… 0 346 0 346 MT-ND3
## 10 ENST00000361381.2 ATGCTAAAACTAATCG… 0 1378 0 1378 MT-ND4
## # ℹ 55 more rows
## # ℹ 3 more variables: gene_type <chr>, genetype2 <chr>, CDS_seq <chr>
# Length of the shortest extracted CDS sequence.
methylated_RNAs_CDSseq |>
  pull(CDS_seq) |>
  str_length() |>
  min()
## [1] 75
# Codon usage of the methylated RNAs: cut every CDS into consecutive chunks
# of three characters, count the chunks per gene type, keep only chunks that
# are one of the 64 codons, and list the most frequent first.
methylated_RNAs_CDSseq |>
  mutate(codon_list = strsplit(CDS_seq, split = "(?<=.{3})", perl = TRUE)) |>
  unnest(codon_list) |>
  count(genetype2, codon_list, name = 'count') |>
  filter(codon_list %in% codons) |>
  arrange(desc(count))
## # A tibble: 121 × 3
## genetype2 codon_list count
## <chr> <chr> <int>
## 1 mRNA AAG 627
## 2 mRNA GAG 372
## 3 mRNA CTG 365
## 4 mRNA GCC 360
## 5 mRNA ATC 314
## 6 mRNA AAA 310
## 7 mRNA GCT 290
## 8 mRNA CAG 288
## 9 mRNA GGC 288
## 10 mRNA GTG 288
## # ℹ 111 more rows
# For sites whose k-mer centre satisfies (kmer_middle - thickEnd) %% 3 == 0,
# take the first three letters of the reference k-mer as the codon and count
# codons per region and gene type, most frequent first.
DRS_methylated_positions_CDSpos_regioninfo |>
  filter((kmer_middle - thickEnd) %% 3 == 0) |>
  mutate(codon = str_sub(ref_kmer, start = 1, end = 3)) |>
  group_by(kmer_region, genetype2, codon) |>
  summarise(n = n(), .groups = 'drop') |>
  arrange(desc(n))
## # A tibble: 22 × 4
## kmer_region genetype2 codon n
## <chr> <chr> <chr> <int>
## 1 CDS mRNA GCC 46
## 2 CDS mRNA ACC 20
## 3 CDS mRNA GTC 8
## 4 fiveprimeUTR mRNA GCC 7
## 5 CDS mRNA ATC 6
## 6 threeprimeUTR mRNA ACC 6
## 7 fiveprimeUTR mRNA ACC 5
## 8 CDS mRNA CTC 4
## 9 CDS mRNA TCC 4
## 10 threeprimeUTR mRNA GTC 4
## # ℹ 12 more rows
# The 20 CDS sites whose `rel_position_metagene` is closest to 50.
m3C_relposition_mRNA_metagene |>
  filter(kmer_region == 'CDS') |>
  arrange(abs(rel_position_metagene - 50)) |>
  head(20) |>
  select(
    transcript_id, gene_name, kmer_middle, ref_kmer, rel_position_metagene
  )
## # A tibble: 20 × 5
## transcript_id gene_name kmer_middle ref_kmer rel_position_metagene
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000395566.9 MDK 520 GACTA 41.0
## 2 ENST00000647248.2 RPL35A 384 CCCCT 40.0
## 3 ENST00000647248.2 RPL35A 383 ACCCC 39.8
## 4 ENST00000343262.9 RPS2 865 CTCCG 39.5
## 5 ENST00000646449.2 RPS26 579 CCCCC 39.5
## 6 ENST00000270625.7 RPS11 475 CACCA 39.4
## 7 ENST00000331825.11 FTL 695 ATCTC 39.1
## 8 ENST00000343262.9 RPS2 852 CCCAC 39.0
## 9 ENST00000343262.9 RPS2 851 ACCCA 39.0
## 10 ENST00000343262.9 RPS2 840 ACCTC 38.6
## 11 ENST00000392514.9 RPLP0 929 GCCAC 37.4
## 12 ENST00000343262.9 RPS2 808 CACCA 37.3
## 13 ENST00000646449.2 RPS26 557 ACCTG 37.3
## 14 ENST00000491306.6 RPL37A 277 GCCAT 37.2
## 15 ENST00000343262.9 RPS2 806 TTCAC 37.2
## 16 ENST00000368719.9 S100A6 301 GCCTT 36.8
## 17 ENST00000343262.9 RPS2 786 ACCTC 36.4
## 18 ENST00000392514.9 RPLP0 899 GCCTT 36.3
## 19 ENST00000343262.9 RPS2 780 CCCCC 36.2
## 20 ENST00000429711.7 RPL32 425 GCCCA 36.1
# CDS annotation in transcript coordinates plus the per-transcript exon
# (block) sizes as an integer list-column.
# The comma-separated `blockSizes` string is parsed once per transcript. The
# sizes are kept as-is when strand == 1 and reversed otherwise (presumably so
# they run 5'->3' along minus-strand transcripts -- confirm against the BED).
# Transcripts with no strand in the annotation table get NA, which
# calc_distance_from_exon_junction() filters out.
gencode_annotation_CDS_blocksizes <-
  read_bed12(
    '/Volumes/Mitsu_NGS_2/METTL2A/Database/gencode.v43.annotation_plus-tRNA_CDS_transcriptome.bed'
  ) |>
  select(chrom, start, end, thickStart, thickEnd, blockSizes) |>
  dplyr::rename(transcript_id = chrom) |>
  left_join(
    read_tsv('/Volumes/Mitsu_NGS_2/METTL2A/Database/gencode.v43.annotation_plus-tRNA.tsv') |>
      filter(primary_tag == 'transcript') |>
      select(transcript_id, strand),
    by = join_by(transcript_id)
  ) |>
  mutate(
    # Reverse the block order when the strand is not +1.
    blockSizes = map2(
      str_split(blockSizes, ','), strand,
      \(sizes, transcript_strand) {
        if (is.na(transcript_strand)) {
          return(NA)
        }
        sizes <- as.integer(sizes)
        if (transcript_strand == 1) sizes else rev(sizes)
      }
    )
  ) |>
  select(-strand)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 3422423 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (25): seq_id, source_tag, primary_tag, score, frame, artif_dupl, ccdsid,...
## dbl (4): start, end, strand, level
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
# Print the table to check that blockSizes is now an integer list-column.
gencode_annotation_CDS_blocksizes
## # A tibble: 111,324 × 6
## transcript_id start end thickStart thickEnd blockSizes
## <chr> <dbl> <dbl> <dbl> <dbl> <list>
## 1 ENST00000003912.7 0 5481 715 1687 <int [13]>
## 2 ENST00000008440.9 0 1667 128 749 <int [3]>
## 3 ENST00000009105.5 0 2612 245 1673 <int [13]>
## 4 ENST00000010299.10 0 1050 24 1047 <int [10]>
## 5 ENST00000011700.10 0 10969 0 9629 <int [52]>
## 6 ENST00000054650.9 0 1361 156 873 <int [6]>
## 7 ENST00000054666.11 0 2178 88 388 <int [5]>
## 8 ENST00000078527.9 0 2066 160 1639 <int [4]>
## 9 ENST00000164247.5 0 4273 564 1665 <int [16]>
## 10 ENST00000166244.8 0 5019 147 3162 <int [17]>
## # ℹ 111,314 more rows
# Locate each position within its exon and measure its distance to both exon
# ends.
#
# df: data frame with `transcript_id` and `kmer_middle`. `kmer_middle` is
#     treated as a 1-based transcript position.
#
# Exon boundaries are the cumulative sums of the transcript's block sizes
# (taken from the global `gencode_annotation_CDS_blocksizes`), so exon i
# covers the positions (blockStart, blockEnd]. Rows whose transcript has no
# block sizes are dropped.
#
# Returns `df` plus: blockSizes, blockIndex, blockStart, blockEnd,
# distanceFromStart (1 = first base of the exon), distanceFromEnd
# (1 = last base of the exon), exonlength and relativePosition.
calc_distance_from_exon_junction <- function(df) {
  # Cumulative exon boundaries: 0, end of exon 1, end of exon 2, ...
  exon_bounds <- function(sizes) cumsum(c(0, unlist(sizes)))

  df |>
    left_join(
      gencode_annotation_CDS_blocksizes |> select(transcript_id, blockSizes),
      by = join_by(transcript_id)
    ) |>
    filter(!is.na(blockSizes)) |>
    rowwise() |>
    mutate(
      # left.open = TRUE gives intervals (start, end]. With the default
      # [start, end) a 1-based position on the last base of an exon is
      # assigned to the following exon, and the last base of the transcript
      # falls outside every exon.
      blockIndex = findInterval(
        kmer_middle, exon_bounds(blockSizes), left.open = TRUE
      ),
      blockStart = exon_bounds(blockSizes)[blockIndex],
      blockEnd = exon_bounds(blockSizes)[blockIndex + 1],
      distanceFromStart = kmer_middle - blockStart,
      distanceFromEnd = blockEnd - kmer_middle + 1,
      exonlength = (blockEnd - blockStart),
      relativePosition = distanceFromStart / exonlength
    ) |>
    ungroup()
}
# Distance of every position to the nearest exon junction.
# A distance measured to the transcript start (position in the first exon) or
# to the transcript end (position in the last exon) is not a junction
# distance and is set to NA before taking the minimum.
allC_m3C_mRNA_relposition_bound_exonposition <-
  allC_m3C_mRNA_relposition_bound |>
  #m3C_relposition_mRNA_metagene |>
  select(transcript_id, gene_name, end, genetype2, kmer_region, kmer_middle, type) |>
  calc_distance_from_exon_junction() |>
  mutate(
    distanceFromStart = ifelse(
      distanceFromStart == kmer_middle, NA, distanceFromStart
    ),
    distanceFromEnd = ifelse(
      distanceFromEnd == end - kmer_middle + 1,
      NA, distanceFromEnd
    ),
    # Vectorised minimum. When both distances are NA (no junction on either
    # side) the result is NA; a row-wise min(..., na.rm = TRUE) would return
    # Inf with a warning.
    dist_from_exon_junction = pmin(
      distanceFromStart, distanceFromEnd, na.rm = TRUE
    )
  ) |>
  filter(!is.na(kmer_region))
## Joining with `by = join_by(transcript_id)`
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `dist_from_exon_junction = min(distanceFromStart,
## distanceFromEnd, na.rm = TRUE)`.
## ℹ In row 19566.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
# Print the table to check the exon-position columns that were added.
allC_m3C_mRNA_relposition_bound_exonposition
## # A tibble: 24,141 × 16
## transcript_id gene_name end genetype2 kmer_region kmer_middle type
## <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 ENST00000429711.7 RPL32 2094 mRNA fiveprimeUTR 3 allC
## 2 ENST00000429711.7 RPL32 2094 mRNA fiveprimeUTR 4 allC
## 3 ENST00000429711.7 RPL32 2094 mRNA fiveprimeUTR 5 allC
## 4 ENST00000429711.7 RPL32 2094 mRNA fiveprimeUTR 9 allC
## 5 ENST00000429711.7 RPL32 2094 mRNA fiveprimeUTR 11 allC
## 6 ENST00000429711.7 RPL32 2094 mRNA fiveprimeUTR 13 allC
## 7 ENST00000429711.7 RPL32 2094 mRNA fiveprimeUTR 14 allC
## 8 ENST00000429711.7 RPL32 2094 mRNA fiveprimeUTR 16 allC
## 9 ENST00000429711.7 RPL32 2094 mRNA fiveprimeUTR 17 allC
## 10 ENST00000429711.7 RPL32 2094 mRNA fiveprimeUTR 20 allC
## # ℹ 24,131 more rows
## # ℹ 9 more variables: blockSizes <list>, blockIndex <int>, blockStart <dbl>,
## # blockEnd <dbl>, distanceFromStart <dbl>, distanceFromEnd <dbl>,
## # exonlength <dbl>, relativePosition <dbl>, dist_from_exon_junction <dbl>
# Distinct values of `type` present in the table.
allC_m3C_mRNA_relposition_bound_exonposition |>
  pull(type) |>
  unique()
## [1] "allC" "m3C" "all CC"
# Distinct values of `kmer_region` present in the table.
allC_m3C_mRNA_relposition_bound_exonposition |>
  pull(kmer_region) |>
  unique()
## [1] "fiveprimeUTR" "CDS" "threeprimeUTR"
# Pairwise Wilcoxon tests of the junction distance between the `type` groups,
# without multiple-testing correction.
rstatix::wilcox_test(
  allC_m3C_mRNA_relposition_bound_exonposition,
  dist_from_exon_junction ~ type,
  p.adjust.method = 'none'
)
## # A tibble: 3 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 dist_from_… all CC allC 4368 19519 41485745 5 e- 3 5 e- 3 **
## 2 dist_from_… all CC m3C 4368 254 789707 6.21e-30 6.21e-30 ****
## 3 dist_from_… allC m3C 19519 254 3585067 1.94e-34 1.94e-34 ****
# ECDF of (junction distance + 1) on a log10 x axis, one curve per `type`.
ecdf_distance_from_exonjunction <-
  ggplot(
    allC_m3C_mRNA_relposition_bound_exonposition,
    aes(x = dist_from_exon_junction + 1, colour = type)
  ) +
  scale_x_log10() +
  stat_ecdf(lwd = 1.1) +
  scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
# No basename is passed; presumably the helper derives the file name from the
# object name -- verify before renaming the variable.
ggsave_multiple_formats(
  ecdf_distance_from_exonjunction,
  outdir = figdir, width = 4, height = 4, fontsize = 7
)
# The same pairwise Wilcoxon tests, run separately within each region.
rstatix::wilcox_test(
  group_by(allC_m3C_mRNA_relposition_bound_exonposition, kmer_region),
  formula = dist_from_exon_junction ~ type,
  p.adjust.method = 'none'
)
## # A tibble: 9 × 10
## kmer_region .y. group1 group2 n1 n2 statistic p p.adj
## * <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 CDS dist_from… all CC allC 1730 7490 6495480 8.68e- 1 8.68e- 1
## 2 CDS dist_from… all CC m3C 1730 179 161603 3.35e- 1 3.35e- 1
## 3 CDS dist_from… allC m3C 7490 179 697623 3.52e- 1 3.52e- 1
## 4 fiveprimeUTR dist_from… all CC allC 479 1845 450890 4.91e- 1 4.91e- 1
## 5 fiveprimeUTR dist_from… all CC m3C 479 30 7746 4.73e- 1 4.73e- 1
## 6 fiveprimeUTR dist_from… allC m3C 1845 30 29326. 5.75e- 1 5.75e- 1
## 7 threeprimeUTR dist_from… all CC allC 2159 10184 10809061 2.2 e- 1 2.2 e- 1
## 8 threeprimeUTR dist_from… all CC m3C 2159 45 85392 2.96e-18 2.96e-18
## 9 threeprimeUTR dist_from… allC m3C 10184 45 406584. 2.77e-19 2.77e-19
## # ℹ 1 more variable: p.adj.signif <chr>
# ECDF of (junction distance + 1), one panel per region stacked vertically,
# each panel with free scales.
ecdf_distance_from_exonjunction_groupedby_region <-
  ggplot(
    allC_m3C_mRNA_relposition_bound_exonposition,
    aes(x = dist_from_exon_junction + 1, colour = type)
  ) +
  scale_x_log10() +
  stat_ecdf(lwd = 1.1) +
  facet_wrap(~ kmer_region, ncol = 1, scales = 'free') +
  scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
# No basename is passed; presumably the helper derives the file name from the
# object name -- verify before renaming the variable.
ggsave_multiple_formats(
  ecdf_distance_from_exonjunction_groupedby_region,
  outdir = figdir, width = 4, height = 9, fontsize = 7
)
# Pairwise Wilcoxon tests of exon length between the `type` groups.
# p.adjust.method = 'none' is set explicitly so this table is comparable with
# the other Wilcoxon tables in this analysis; without it rstatix applies its
# default correction and `p.adj` differs from `p`.
allC_m3C_mRNA_relposition_bound_exonposition |>
  filter(!is.na(exonlength)) |>
  rstatix::wilcox_test(exonlength ~ type, p.adjust.method = 'none')
## # A tibble: 3 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 exonlength all CC allC 4368 19512 41274680 1 e- 3 1 e- 3 **
## 2 exonlength all CC m3C 4368 254 779222. 1.77e-27 3.54e-27 ****
## 3 exonlength allC m3C 19512 254 3538188. 8.18e-32 2.45e-31 ****
# ECDF of exon length on a log10 x axis (10 to 10000), one curve per `type`.
# The raw `exonlength` is plotted, matching the per-region version of this
# figure and the Wilcoxon tests; a +1 pseudo-count would only shift the curve.
ecdf_exon_length <-
  allC_m3C_mRNA_relposition_bound_exonposition |>
  filter(!is.na(exonlength)) |>
  ggplot(aes(x = exonlength, colour = type)) +
  scale_x_log10(limits = c(10, 10000)) +
  stat_ecdf(lwd = 1.1) +
  scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
# No basename is passed; presumably the helper derives the file name from the
# object name -- verify before renaming the variable.
ecdf_exon_length |>
  ggsave_multiple_formats(
    outdir = figdir, width = 4, height = 4, fontsize = 7
  )
# Pairwise Wilcoxon tests of exon length between the `type` groups, run
# separately within each region and without multiple-testing correction.
rstatix::wilcox_test(
  group_by(
    filter(allC_m3C_mRNA_relposition_bound_exonposition, !is.na(exonlength)),
    kmer_region
  ),
  formula = exonlength ~ type,
  p.adjust.method = 'none'
)
## # A tibble: 9 × 10
## kmer_region .y. group1 group2 n1 n2 statistic p p.adj
## * <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 CDS exonlength all CC allC 1730 7490 6515654. 7.12e- 1 7.12e- 1
## 2 CDS exonlength all CC m3C 1730 179 166752. 9 e- 2 9 e- 2
## 3 CDS exonlength allC m3C 7490 179 718260 1.02e- 1 1.02e- 1
## 4 fiveprimeUTR exonlength all CC allC 479 1845 462156. 1.21e- 1 1.21e- 1
## 5 fiveprimeUTR exonlength all CC m3C 479 30 7561 6.31e- 1 6.31e- 1
## 6 fiveprimeUTR exonlength allC m3C 1845 30 27669 9.99e- 1 9.99e- 1
## 7 threeprimeUTR exonlength all CC allC 2159 10177 10647886. 2.4 e- 2 2.4 e- 2
## 8 threeprimeUTR exonlength all CC m3C 2159 45 82104. 1.84e-15 1.84e-15
## 9 threeprimeUTR exonlength allC m3C 10177 45 392004. 1.31e-16 1.31e-16
## # ℹ 1 more variable: p.adj.signif <chr>
# ECDF of exon length (log10 x axis, 10 to 10000), one panel per region
# stacked vertically, each panel with free scales.
ecdf_exon_length_groupedby_region <-
  ggplot(
    filter(allC_m3C_mRNA_relposition_bound_exonposition, !is.na(exonlength)),
    aes(x = exonlength, colour = type)
  ) +
  scale_x_log10(limits = c(10, 10000)) +
  stat_ecdf(lwd = 1.1) +
  facet_wrap(~ kmer_region, ncol = 1, scales = 'free') +
  scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
# No basename is passed; presumably the helper derives the file name from the
# object name -- verify before renaming the variable.
ggsave_multiple_formats(
  ecdf_exon_length_groupedby_region,
  outdir = figdir, width = 4, height = 9, fontsize = 7
)
# Spot check: exon assignment of the m3C sites on one transcript
# (at most the first 20 rows).
allC_m3C_mRNA_relposition_bound_exonposition |>
  filter(type == 'm3C', transcript_id == 'ENST00000303204.9') |>
  head(20) |>
  # filter(grepl('HNRN', gene_name)) |>
  # View()
  select(
    gene_name, transcript_id, kmer_middle, kmer_region,
    blockIndex, distanceFromStart, distanceFromEnd
  )
## # A tibble: 1 × 7
## gene_name transcript_id kmer_middle kmer_region blockIndex distanceFromStart
## <chr> <chr> <dbl> <chr> <int> <dbl>
## 1 PRELID1 ENST0000030320… 540 CDS 3 35
## # ℹ 1 more variable: distanceFromEnd <dbl>
# Spot check: m3C sites whose transcript_id matches this pattern (regular
# expression, unanchored).
m3C_relposition_mRNA_metagene |>
  filter(str_detect(transcript_id, 'ENST00000552461.5'))
## # A tibble: 1 × 20
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000552461.5 RPLP0 chr12 protein_codi… CCCCA 1921 1925
## # ℹ 13 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## # start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>,
## # kmer_region <chr>, rel_position_within_region <dbl>,
## # rel_position_metagene <dbl>
# Distinct transcripts that carry at least one m3C site.
m3C_relposition_mRNA_metagene |>
  pull(transcript_id) |>
  unique()
## [1] "ENST00000429711.7" "ENST00000647248.2" "ENST00000215754.8"
## [4] "ENST00000199764.7" "ENST00000270625.7" "ENST00000331825.11"
## [7] "ENST00000229239.10" "ENST00000552551.5" "ENST00000388835.4"
## [10] "ENST00000646449.2" "ENST00000501597.3" "ENST00000202773.14"
## [13] "ENST00000551150.5" "ENST00000552461.5" "ENST00000392514.9"
## [16] "ESPRESSO:chr2:8790:4" "ENST00000233143.6" "ENST00000491306.6"
## [19] "ENST00000358435.9" "ENST00000468812.6" "ENST00000646101.2"
## [22] "ENST00000321153.9" "ENST00000314138.11" "ENST00000395566.9"
## [25] "ENST00000538451.1" "ENST00000620041.4" "ENST00000314133.4"
## [28] "ENST00000273550.12" "ENST00000274065.9" "ENST00000009589.8"
## [31] "ENST00000352983.7" "ENST00000287038.8" "ENST00000530705.6"
## [34] "ENST00000361575.4" "ENST00000369817.7" "ENST00000243997.8"
## [37] "ENST00000343986.9" "ENST00000260379.11" "ENST00000274242.10"
## [40] "ENST00000296674.13" "ENST00000407193.7" "ENST00000303204.9"
## [43] "ENST00000254810.8" "ENST00000556230.2" "ENST00000286953.8"
## [46] "ENST00000361436.10" "ENST00000323345.11" "ENST00000343262.9"
## [49] "ENST00000309268.11" "ENST00000230050.4" "ENST00000613865.5"
## [52] "ENST00000234875.9" "ENST00000270792.10" "ENST00000354332.8"
## [55] "ENST00000368719.9" "ENST00000368811.8" "ENST00000368716.9"
## [58] "ENST00000651669.1" "ENST00000398752.11"
# Spot check: all m3C sites on one transcript.
filter(m3C_relposition_mRNA_metagene, transcript_id == 'ENST00000368716.9')
## # A tibble: 4 × 20
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000368716.9 S100A4 chr1 protein_codi… GTCCA 108 112
## 2 ENST00000368716.9 S100A4 chr1 protein_codi… GCCAT 316 320
## 3 ENST00000368716.9 S100A4 chr1 protein_codi… GCCAG 404 408
## 4 ENST00000368716.9 S100A4 chr1 protein_codi… TTCCA 446 450
## # ℹ 13 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## # start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>,
## # kmer_region <chr>, rel_position_within_region <dbl>,
## # rel_position_metagene <dbl>
# Export the distinct (transcript_id, gene_name) pairs, sorted by gene name.
# The basename is given without an extension: export_tsv() appends a date
# stamp and '.tsv' itself, so 'temp.tsv' produced 'temp.tsv_<date>.tsv'.
m3C_relposition_mRNA_metagene |>
  distinct(transcript_id, gene_name) |>
  arrange(gene_name) |>
  export_tsv(basename = 'temp')
##
## Exported to: Tables/temp.tsv_2024-07-29.tsv
## # A tibble: 59 × 2
## transcript_id gene_name
## <chr> <chr>
## 1 ENST00000646101.2 ARPC1B
## 2 ENST00000398752.11 ATP5F1A
## 3 ENST00000243997.8 ATP5F1E
## 4 ENST00000286953.8 ATP5MJ
## 5 ENST00000199764.7 CEACAM6
## 6 ENST00000314133.4 COX8A
## 7 ENST00000309268.11 EEF1A1
## 8 ENST00000620041.4 FTH1
## 9 ENST00000273550.12 FTH1
## 10 ENST00000331825.11 FTL
## # ℹ 49 more rows
# Print the full table of methylated positions.
DRS_methylated_positions
## # A tibble: 489 × 13
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCG 58 62
## 5 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCT 76 80
## 6 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ATCAA 94 98
## 7 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA GCCAC 149 153
## 8 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCC 154 158
## 9 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCC 155 159
## 10 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCA 156 160
## # ℹ 479 more rows
## # ℹ 6 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>
# Only the m3C rows of the exon-position table.
filter(allC_m3C_mRNA_relposition_bound_exonposition, type == 'm3C')
## # A tibble: 254 × 16
## transcript_id gene_name end genetype2 kmer_region kmer_middle type
## <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 ENST00000429711.7 RPL32 2094 mRNA CDS 425 m3C
## 2 ENST00000647248.2 RPL35A 1234 mRNA CDS 383 m3C
## 3 ENST00000647248.2 RPL35A 1234 mRNA CDS 384 m3C
## 4 ENST00000215754.8 MIF 557 mRNA fiveprimeUTR 81 m3C
## 5 ENST00000215754.8 MIF 557 mRNA CDS 182 m3C
## 6 ENST00000215754.8 MIF 557 mRNA CDS 193 m3C
## 7 ENST00000215754.8 MIF 557 mRNA threeprimeUTR 486 m3C
## 8 ENST00000199764.7 CEACAM6 2594 mRNA threeprimeUTR 1700 m3C
## 9 ENST00000270625.7 RPS11 573 mRNA CDS 163 m3C
## 10 ENST00000270625.7 RPS11 573 mRNA CDS 475 m3C
## # ℹ 244 more rows
## # ℹ 9 more variables: blockSizes <list>, blockIndex <int>, blockStart <dbl>,
## # blockEnd <dbl>, distanceFromStart <dbl>, distanceFromEnd <dbl>,
## # exonlength <dbl>, relativePosition <dbl>, dist_from_exon_junction <dbl>
# Write the methylated positions as a headerless three-column BED file in
# transcript coordinates: each site is the single-base interval
# [kmer_middle - 1, kmer_middle).
DRS_methylated_positions |>
  mutate(start = kmer_middle - 1, end = kmer_middle) |>
  select(transcript_id, start, end) |>
  write_tsv(
    file = paste_wd('Tables/DRS_m3C_sites/m3C_sites.bed'),
    col_names = FALSE
  )
# Load the GENCODE v43 annotation table from the working directory.
gencode_annotation <- read_tsv(
  paste_wd('Tables/Database/gencode.v43.annotation.tsv')
)
## Rows: 3422892 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): seq_id, source_tag, primary_tag, score, frame, artif_dupl, ccdsid,...
## dbl (4): start, end, strand, level
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the annotation table to inspect its columns.
gencode_annotation
## # A tibble: 3,422,892 × 28
## seq_id source_tag primary_tag start end score strand frame artif_dupl
## <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 chrY HAVANA gene 253743 255091 . 1 . N/A
## 2 chrY HAVANA transcript 253743 255091 . 1 . N/A
## 3 chrY HAVANA exon 253743 253846 . 1 . N/A
## 4 chrY HAVANA exon 254937 255091 . 1 . N/A
## 5 chrY HAVANA gene 276322 303356 . 1 . N/A
## 6 chrY HAVANA transcript 276322 303353 . 1 . N/A
## 7 chrY HAVANA exon 276322 276394 . 1 . N/A
## 8 chrY HAVANA exon 281482 281684 . 1 . N/A
## 9 chrY HAVANA exon 284167 284314 . 1 . N/A
## 10 chrY HAVANA exon 288733 288869 . 1 . N/A
## # ℹ 3,422,882 more rows
## # ℹ 19 more variables: ccdsid <chr>, exon_id <chr>, exon_number <chr>,
## # gene_id <chr>, gene_name <chr>, gene_type <chr>, havana_gene <chr>,
## # havana_transcript <chr>, hgnc_id <chr>, ID <chr>, level <dbl>, ont <chr>,
## # Parent <chr>, protein_id <chr>, tag <chr>, transcript_id <chr>,
## # transcript_name <chr>, transcript_support_level <chr>,
## # transcript_type <chr>
# Write the annotation rows of every gene that has a significant position
# whose reference k-mer matches '..C..' (a C with at least two characters on
# either side, i.e. the central base of a 5-mer). The right join keeps every
# such gene_id, including ones without a row in the annotation.
gencode_annotation |>
  right_join(
    paste_wd('Tables/DRS/Positions/common_sig_seqs_in_intensity_up_2024-04-22.tsv.gz') |>
      read_tsv() |>
      filter(grepl('..C..', ref_kmer)) |>
      distinct(gene_id),
    by = join_by(gene_id)
  ) |>
  write_tsv(paste_wd('Tables/Database/gencode.v43.annotation_m3CRNAs.tsv'))
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.